Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.

1 Introduction

Following from previous pages, this page will focus on filtering the data before clustering to explore if filtering improves the outcome of clustering.

2 Data

Here we read in the data and select a random half of it for exploration.

featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
locFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt",showProgress=FALSE)

### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^10)
half1 <- sample(dim(featFull)[1],dim(featFull)[1]/2)
half2 <- setdiff(1:dim(featFull)[1],half1)

feat <- featFull[half1,]
loc <- locFull[half1,]
dim(feat)
# [1] 559649    144
## Setting the channel names
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
              'psd','glur2','nmdar1','nr2b','gad','VGAT',
              'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
              'NOS','TH','VACht','Synapo','tubuli','DAPI')

## Setting the channel types
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','other','other','ex.post','none','none')

nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel

## Createing factor variables for channel and channel type sorted properly
ffchannel <- (factor(channel.type,
    levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
    ))
fchannel <- as.numeric(factor(channel.type,
    levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
    ))
ford <- order(fchannel)


## Setting up colors for channel types
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]

exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","mediumblue");
exCol <- as.character(exCol)

fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)

2.1 Data transformations

f <- lapply(1:6,function(x){seq(x,ncol(feat),by=nfeat)})
featF <- lapply(f,function(x){subset(feat,select=x)})

featF0 <- featF[[1]]
f01e3 <- 1e3*data.table(apply(X=featF0, 2, function(x){((x-min(x))/(max(x)-min(x)))}))

fs <- f01e3

### Taking log_10 on data with 0's removed
ans <- apply(featF0, 1, function(row){ any(row == 0)})

logF0 <- log10(featF0[!ans,])
slogF0 <- logF0[,lapply(.SD,scale, center=TRUE,scale=TRUE)]

We now have the following data sets:

  • featF0: The feature vector looking only at the integrated brightness features.
  • fs: The feature vector scaled between \([0,1000]\).
  • logF0: The feature vector, with 0’s removed, then \(log_{10}\) is applied.
  • slogF0: The feature vector, with 0’s removed, then \(log_{10}\), then scaled by subtracting the mean and dividing by the sample standard deviation.

3 Kernel Density Estimates of the marginals

df1 <- melt(as.matrix(fs))
names(df1) <- c("ind","channel","value")
df1$type <- factor(rep(ffchannel,each=dim(fs)[1]),levels=levels(ffchannel))

lvo <- c(1:5,7:10,19,22,11:16,6,17,18,20,21,23,24)
levels(df1$channel)<-levels(df1$channel)[lvo]

ts <- 22

gg1 <- ggplot(df1, aes(x=value)) + 
    scale_color_manual(values=ccol[lvo]) +
    scale_fill_manual(values=ccol[lvo]) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_wrap( ~ channel, scale='free', ncol=6) +
    theme(plot.title=element_text(size=ts),
          axis.title.x=element_text(size=ts),
          axis.title.y=element_text(size=ts),
          legend.title=element_text(size=ts),
          legend.text=element_text(size=ts-2),
          axis.text=element_text(size=ts-2),
          strip.text=element_text(size=ts), 
          legend.position='none')+
    ggtitle("Kernel Density Estimates of `fs` data.")

print(gg1)
Figure 1: Kernel density estimates for each channel, on fs data.

4 Correlations

cmatfs <- cor(fs)
corrplot(cmatfs,method="color",tl.col=ccol[ford], tl.cex=1)
Figure 2: Correlation on untransformed F0 data, reordered by synapse type.

5 PCA on the Correlation Matrix

pcaf0 <- prcomp(featF0,scale=TRUE, center=TRUE)
pcafs <- prcomp(fs,scale=FALSE, center=FALSE)
elpcaf0 <- getElbows(pcaf0$sdev, plot=FALSE)
elpcafs <- getElbows(pcafs$sdev, plot=FALSE)

6 K-means++ for \(K=2\) (Level 1).

We run K-means++ for \(K=2\) on the fs data.

K1 <- c(2)  ## The set of K's.

set.seed(2^13)
k0 <- kmpp(fs, k=2)
kl <- list()
kl[[1]] <- list(c1 = k0$cluster == 1,c2 = k0$cluster==2, k1 = k0)

6.1 Within cluster correlations (Lv 1)

corkp1 <- cor(fs[kl[[1]]$c1,])
corkp2 <- cor(fs[kl[[1]]$c2,])

par(mfrow=c(2,2))
corrplot(corkp1,method="color",tl.col=ccol[ford], tl.cex=0.8)
corrplot(corkp2,method="color",tl.col=ccol[ford], tl.cex=0.8)
corrplot(sqrt((corkp1 - corkp2)^2),method="color",tl.col=ccol[ford], tl.cex=0.8)
Figure 3: Within cluster correlations, clock-wise from top left, Cluster 1, Cluster 2, l2 distance between C1 and C2

Notice that the non-synaptic markers change very little between clusters. Also note that the correlations between (gad, VGAT, PV, Gephyr) and VGlut1 at both times change significantly between clusters.

6.2 Heat maps (Lv 1):

## Formatting data for heatmap
aggp <- aggregate(fs,by=list(lab=kl[[1]]$k1$cluster),FUN=mean) # FIX THIS
aggp <- as.matrix(aggp[,-1])
rownames(aggp) <- clusterFraction(kl[[1]]$k1$cluster)

The following are heatmaps generated from clustering via K-means++

heatmap.2(as.matrix(aggp),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1.25,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90,main="Heatmap of `fs` data.") 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 4: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.

Percentage of data within cluster is presented on the right side of the heatmap.

6.3 Kernel Density Estimates of the marginals | cluster (Lv 1)

Here we look at the kernel density estimates within each cluster to compare.

df2 <- melt(as.matrix(fs))
names(df2) <- c("ind","channel","value")
df2$cluster <- kl[[1]]$k1$cluster
df2$type <- factor(rep(ffchannel,each=dim(fs)[1]),levels=levels(ffchannel))

gg2 <- ggplot(df2, aes(x=value)) + 
    scale_colour_manual(values=ccol) + 
    scale_x_continuous(limits=c(0,400)) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=250) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_grid(channel ~ cluster, scale='free') + 
    theme(strip.text.y=element_text(angle=0)) +
    guides(col=guide_legend(ncol=1))
print(gg2)
Figure 5: Kernel density estimates for each channel, on fs data given cluster from km++

6.4 Clusters and Spatial Location (Lv 1)

Using the location data and the results of K-means++ we show a 3d scatter plot colored accoding to cluster.

set.seed(2^12)
s1 <- sample(dim(loc)[1],5e4)

locs1 <- loc[s1,]
locs1$cluster <- kl[[1]]$k1$cluster[s1]

plot3d(locs1$V1,locs1$V2,locs1$V3,
       col=ifelse(locs1$cluster==1,'#d95f02','#6a3d9a'), #orange,purple
       alpha=0.75,
       xlab='x', 
       ylab='y', 
       zlab='z')

subid <- currentSubscene3d()
rglwidget(elementId="plot3dLocations")

7 K-means++ for \(K=2\) (Level 2).

We run the next level of K-means++ for \(K=2\) on the fs data.

###
###
### Naming convention: 
###     k<numeric> := kmpp on the <numeric>th node
###         in the tree as read in a binary tree 
###
###     c<numeric> := the logical vector of class membership for 
###         <numeric>th cluster.
###      
K2 <- c(2)  ## The set of K's.

set.seed(2^13)
k1 <-kmpp(fs[kl[[1]]$c1,], k = K2)
set.seed(2^13)
k2 <-kmpp(fs[kl[[1]]$c2,], k = K2)
kl[[2]] <- list(c11 = k1$cluster == 1, c12 = k1$cluster==2, 
                c21 = k2$cluster == 1, c22 = k2$cluster==2, k1=k1,k2=k2)

7.1 Within cluster correlations (Lv 2)

corkp11 <- cor(featF0[kl[[1]]$c1,][kl[[2]]$c11,])
corkp12 <- cor(featF0[kl[[1]]$c1,][kl[[2]]$c12,])
corkp21 <- cor(featF0[kl[[1]]$c2,][kl[[2]]$c21,])
corkp22 <- cor(featF0[kl[[1]]$c2,][kl[[2]]$c22,])

par(mfrow=c(1,4))
corrplot(corkp11,method="color",tl.col=ccol[ford], tl.cex=0.8)
corrplot(corkp12,method="color",tl.col=ccol[ford], tl.cex=0.8)
corrplot(corkp21,method="color",tl.col=ccol[ford], tl.cex=0.8)
corrplot(corkp22,method="color",tl.col=ccol[ford], tl.cex=0.8)
Figure 6: Within cluster correlations for level 2. (c11, c12, c21, c22)

7.2 Heat maps (Lv 2):

## Formatting data for heatmap
labs2 <- c(kl[[2]]$k1$cluster,kl[[2]]$k2$cluster+2)
aggp2 <- aggregate(fs,by=list(lab=labs2),FUN=function(x){mean(x)}) # FIX THIS
aggp2 <- as.matrix(aggp2[,-1])
rownames(aggp2) <- clusterFraction(labs2)

The following are heatmaps generated from clustering via K-means++

heatmap.2(as.matrix(aggp2),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1.25,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90,main="Heatmap of `fs` data.") 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 7: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.

Percentage of data within cluster is presented on the right side of the heatmap.

7.3 Kernel Density Estimates of the marginals | cluster (Lv 2)

Here we look at the kernel density estimates within each cluster to compare.

df2 <- melt(as.matrix(fs))
names(df2) <- c("ind","channel","value")
df2$cluster <- labs2
df2$type <- factor(rep(ffchannel,each=dim(fs)[1]),levels=levels(ffchannel))

gg3 <- ggplot(df2, aes(x=value)) + 
    scale_colour_manual(values=ccol) + 
    #scale_x_continuous(limits=c(0,400)) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=250) +
    geom_density(aes(group=channel),size=.5, color='black', alpha=0.5) +
    facet_grid(channel ~ cluster, scale='free') + 
    theme(strip.text.y=element_text(angle=0)) +
    guides(col=guide_legend(ncol=1))
print(gg3)
Figure 8: Kernel density estimates for each channel, on fs data given cluster from km++ level 2

7.4 Clusters and Spatial Location (Lv 2)

Using the location data and the results of K-means++ we show a 3d scatter plot colored according to cluster.

set.seed(2^12)
s1 <- sample(dim(loc)[1],5e4)

locs1 <- loc[s1,]
locs1$cluster <- c(kl[[2]]$k1$cluster,kl[[2]]$k2$cluster+2)[s1]

plot3d(locs1$V1,locs1$V2,locs1$V3,
       col=brewer.pal(4,"BrBG")[locs1$cluster],#ifelse(locs1$cluster==1,'#d95f02','#6a3d9a'), #orange,purple
       alpha=0.75,
       xlab='x', 
       ylab='y', 
       zlab='z'
       )

subid <- currentSubscene3d()
rglwidget(elementId="plot3dLocationsLV2")

7.5 GABABR

## re-formatting data for use in lattice 
d1gab <- data.table(stack(fs, select=-GABABRF0))[,.(values)]
d1gab$GABABR <- fs$GABABRF0

### Adding relationship factor variables
nd <- paste0("GABABR","~",abbreviate(channel[-which(channel=="GABABR")]))

d1gab$ind <- factor(rep(nd,each=dim(fs)[1]),ordered=TRUE,levels=nd)

names(d1gab) <- c("x","y","g")

lat1 <- xyplot(y ~ x | g, data=d1gab,
       as.table=TRUE,
       colramp=BTC,
       pch='.',
       scales = list(y = list(relation = "free"),x = list(relation = "free")),
       panel=function(x,y,...){
           panel.hexbinplot(x,y,..., type='g')
           panel.loess(x,y,col='red', lwd=2,...)
        }
       )
gg3 <- ggplot(data=d1gab,aes(x=x,y=y, group=g)) +   
        geom_point(pch='.',alpha=0.2) + 
        geom_hex(bins=100) +
        geom_smooth(method='lm',colour='red', alpha=0.7)+
        facet_wrap( ~ g, scales='free_x') 
print(gg3)
Figure 9: Pairs plots of GABABR and all other markers with regression lines.